library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
# Load the tab-separated marketing campaign data; the first row of the file
# holds the column names.
df <- read.csv("marketing_campaign.csv", sep = "\t", header = TRUE)
# Preview the first rows to check that the columns were parsed as expected
head(df)
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524 1957 Graduation Single 58138 0 0 04-09-2012
## 2 2174 1954 Graduation Single 46344 1 1 08-03-2014
## 3 4141 1965 Graduation Together 71613 0 0 21-08-2013
## 4 6182 1984 Graduation Together 26646 1 0 10-02-2014
## 5 5324 1981 PhD Married 58293 1 0 19-01-2014
## 6 7446 1967 Master Together 62513 0 1 09-09-2013
## Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1 58 635 88 546 172 88
## 2 38 11 1 6 2 1
## 3 26 426 49 127 111 21
## 4 26 11 4 20 10 3
## 5 94 173 43 118 46 27
## 6 16 520 42 98 0 42
## MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1 88 3 8 10
## 2 6 2 1 1
## 3 42 1 8 2
## 4 5 2 2 0
## 5 15 5 5 3
## 6 14 2 6 4
## NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1 4 7 0 0 0
## 2 2 5 0 0 0
## 3 10 4 0 0 0
## 4 4 6 0 0 0
## 5 6 5 0 0 0
## 6 10 6 0 0 0
## AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1 0 0 0 3 11 1
## 2 0 0 0 3 11 0
## 3 0 0 0 3 11 0
## 4 0 0 0 3 11 0
## 5 0 0 0 3 11 0
## 6 0 0 0 3 11 0
# Impute missing incomes with the mean income of the customer's education
# level. Income is cast to double so both branches share one type.
df <- df %>%
  group_by(Education) %>%
  mutate(
    Income = if_else(
      is.na(Income),
      mean(Income, na.rm = TRUE),
      as.double(Income)
    )
  ) %>%
  ungroup()
# Derive the engineered features in a single pass, then drop the raw columns
# they replace. New columns are appended in the order listed below.
df <- df %>%
  mutate(
    # Age relative to 2024, replacing the birth year
    Age = 2024 - Year_Birth,
    # Kids and teenagers at home combined
    Children = Kidhome + Teenhome,
    # 1 when the customer has at least one child, 0 otherwise
    Parental_Status = as.integer(Children != 0),
    # Amount spent across all six product categories
    Total_Spent = MntWines + MntFruits + MntMeatProducts +
      MntFishProducts + MntSweetProducts + MntGoldProds,
    # Number of campaigns (1-5) whose offer the customer accepted
    Total_Offer = AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp3 +
      AcceptedCmp4 + AcceptedCmp5,
    # Purchases summed over web, catalog, store and deal channels
    Num_Total_Purchases = NumWebPurchases + NumCatalogPurchases +
      NumStorePurchases + NumDealsPurchases
  ) %>%
  dplyr::select(-Year_Birth, -Kidhome, -Teenhome)
head(df)
## # A tibble: 6 × 32
## ID Education Marital_Status Income Dt_Customer Recency MntWines MntFruits
## <int> <chr> <chr> <dbl> <chr> <int> <int> <int>
## 1 5524 Graduation Single 58138 04-09-2012 58 635 88
## 2 2174 Graduation Single 46344 08-03-2014 38 11 1
## 3 4141 Graduation Together 71613 21-08-2013 26 426 49
## 4 6182 Graduation Together 26646 10-02-2014 26 11 4
## 5 5324 PhD Married 58293 19-01-2014 94 173 43
## 6 7446 Master Together 62513 09-09-2013 16 520 42
## # ℹ 24 more variables: MntMeatProducts <int>, MntFishProducts <int>,
## # MntSweetProducts <int>, MntGoldProds <int>, NumDealsPurchases <int>,
## # NumWebPurchases <int>, NumCatalogPurchases <int>, NumStorePurchases <int>,
## # NumWebVisitsMonth <int>, AcceptedCmp3 <int>, AcceptedCmp4 <int>,
## # AcceptedCmp5 <int>, AcceptedCmp1 <int>, AcceptedCmp2 <int>, Complain <int>,
## # Z_CostContact <int>, Z_Revenue <int>, Response <int>, Age <dbl>,
## # Children <int>, Parental_Status <int>, Total_Spent <int>, …
Descriptive Statistics:
# Numerical variables: quartiles, mean and range for every numeric column.
# summary() is applied to the whole data frame, so character columns are
# listed too but only report their length, class and mode.
print("Basic Statistic for numerical variables")
## [1] "Basic Statistic for numerical variables"
summary(df)
## ID Education Marital_Status Income
## Min. : 0 Length:2240 Length:2240 Min. : 1730
## 1st Qu.: 2828 Class :character Class :character 1st Qu.: 35539
## Median : 5458 Mode :character Mode :character Median : 51610
## Mean : 5592 Mean : 52254
## 3rd Qu.: 8428 3rd Qu.: 68290
## Max. :11191 Max. :666666
## Dt_Customer Recency MntWines MntFruits
## Length:2240 Min. : 0.00 Min. : 0.00 Min. : 0.0
## Class :character 1st Qu.:24.00 1st Qu.: 23.75 1st Qu.: 1.0
## Mode :character Median :49.00 Median : 173.50 Median : 8.0
## Mean :49.11 Mean : 303.94 Mean : 26.3
## 3rd Qu.:74.00 3rd Qu.: 504.25 3rd Qu.: 33.0
## Max. :99.00 Max. :1493.00 Max. :199.0
## MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds
## Min. : 0 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 16 1st Qu.: 3.00 1st Qu.: 1.00 1st Qu.: 9.00
## Median : 67 Median : 12.00 Median : 8.00 Median : 24.00
## Mean : 167 Mean : 37.53 Mean : 27.06 Mean : 44.02
## 3rd Qu.: 232 3rd Qu.: 50.00 3rd Qu.: 33.00 3rd Qu.: 56.00
## Max. :1725 Max. :259.00 Max. :263.00 Max. :362.00
## NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 3.00
## Median : 2.000 Median : 4.000 Median : 2.000 Median : 5.00
## Mean : 2.325 Mean : 4.085 Mean : 2.662 Mean : 5.79
## 3rd Qu.: 3.000 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 8.00
## Max. :15.000 Max. :27.000 Max. :28.000 Max. :13.00
## NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## Min. : 0.000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median : 6.000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean : 5.317 Mean :0.07277 Mean :0.07455 Mean :0.07277
## 3rd Qu.: 7.000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :20.000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact
## Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :3
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:3
## Median :0.00000 Median :0.00000 Median :0.000000 Median :3
## Mean :0.06429 Mean :0.01339 Mean :0.009375 Mean :3
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:3
## Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :3
## Z_Revenue Response Age Children
## Min. :11 Min. :0.0000 Min. : 28.00 Min. :0.0000
## 1st Qu.:11 1st Qu.:0.0000 1st Qu.: 47.00 1st Qu.:0.0000
## Median :11 Median :0.0000 Median : 54.00 Median :1.0000
## Mean :11 Mean :0.1491 Mean : 55.19 Mean :0.9504
## 3rd Qu.:11 3rd Qu.:0.0000 3rd Qu.: 65.00 3rd Qu.:1.0000
## Max. :11 Max. :1.0000 Max. :131.00 Max. :3.0000
## Parental_Status Total_Spent Total_Offer Num_Total_Purchases
## Min. :0.0000 Min. : 5.00 Min. :0.0000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.: 68.75 1st Qu.:0.0000 1st Qu.: 8.00
## Median :1.0000 Median : 396.00 Median :0.0000 Median :15.00
## Mean :0.7152 Mean : 605.80 Mean :0.2978 Mean :14.86
## 3rd Qu.:1.0000 3rd Qu.:1045.50 3rd Qu.:0.0000 3rd Qu.:21.00
## Max. :1.0000 Max. :2525.00 Max. :4.0000 Max. :44.00
# Categorical variables: frequency of each level, as counts and as percentages
print("Basic Statistic for categorical variables")
## [1] "Basic Statistic for categorical variables"
# Education: customers per level and their percentage share of all customers
table_counts <- table(df[["Education"]])
percentage_counts <- 100 * prop.table(table_counts)
# Each table expands into a level column (Var1) and a value column (Freq)
summary_table <- data.frame(
  Counts = table_counts,
  Percentages = percentage_counts
)
print(summary_table)
## Counts.Var1 Counts.Freq Percentages.Var1 Percentages.Freq
## 1 2n Cycle 203 2n Cycle 9.062500
## 2 Basic 54 Basic 2.410714
## 3 Graduation 1127 Graduation 50.312500
## 4 Master 370 Master 16.517857
## 5 PhD 486 PhD 21.696429
# Marital status: customers per level and their percentage share of all
# customers
table_counts <- table(df[["Marital_Status"]])
percentage_counts <- 100 * prop.table(table_counts)
# Each table expands into a level column (Var1) and a value column (Freq)
summary_table <- data.frame(
  Counts = table_counts,
  Percentages = percentage_counts
)
print(summary_table)
## Counts.Var1 Counts.Freq Percentages.Var1 Percentages.Freq
## 1 Absurd 2 Absurd 0.08928571
## 2 Alone 3 Alone 0.13392857
## 3 Divorced 232 Divorced 10.35714286
## 4 Married 864 Married 38.57142857
## 5 Single 480 Single 21.42857143
## 6 Together 580 Together 25.89285714
## 7 Widow 77 Widow 3.43750000
## 8 YOLO 2 YOLO 0.08928571
# Distribution plots
# Numeric columns whose distributions are inspected
numeric_features <- c(
  "Total_Spent", "Num_Total_Purchases", "Age", "Income", "Recency",
  "MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts",
  "MntSweetProducts", "MntGoldProds"
)
# Draw one histogram per column
for (feature in numeric_features) {
  # Bin width = one thirtieth of the column's observed range
  bin_width <- diff(range(df[[feature]])) / 30
  plt <- ggplot(df, aes(x = .data[[feature]])) +
    geom_histogram(
      binwidth = bin_width,
      fill = "skyblue",
      color = "black",
      alpha = 0.7
    ) +
    ggtitle(paste("Distribution of", feature)) +
    xlab(feature) +
    ylab("Frequency") +
    theme_minimal()
  # Render inside the loop; plots are not auto-printed within for()
  print(plt)
}
# Categorical columns whose level frequencies are plotted
categorical_features <- c("Education", "Marital_Status")
# Draw one bar chart per column showing the number of customers in each level
for (feature in categorical_features) {
  plt <- ggplot(df, aes(x = .data[[feature]])) +
    geom_bar(fill = "skyblue", color = "black", alpha = 0.7) +
    ggtitle(paste("Count of customers by", feature)) +
    xlab(feature) +
    ylab("Count") +
    theme_minimal()
  # Render inside the loop; plots are not auto-printed within for()
  print(plt)
}
# Explore relationships
# Scatter plots of every numeric feature against Total_Spent
numeric_features <- c(
  "Total_Spent", "Num_Total_Purchases", "Age", "Income", "Recency",
  "MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts",
  "MntSweetProducts", "MntGoldProds"
)
for (feature in numeric_features) {
  # Look the column up by name through the `.data` pronoun. Referencing
  # `df[[feature]]` inside aes() bypasses the plot's data and makes ggplot2
  # emit a "Use of `df[[feature]]` is discouraged" warning for every plot.
  scatter_plot <- ggplot(df, aes(x = .data[[feature]], y = Total_Spent)) +
    geom_point() +
    labs(
      title = paste("Scatter Plot between", feature, "and Total_Spent"),
      x = feature,
      y = "Total_Spent"
    ) +
    theme_minimal()
  # Render inside the loop; plots are not auto-printed within for()
  print(scatter_plot)
}
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
## Warning: Use of `df[[feature]]` is discouraged.
## ℹ Use `.data[[feature]]` instead.
# Mean income per education level, with one dodged bar per marital status
plt <- ggplot(df, aes(x = Education, y = Income, fill = Marital_Status)) +
  # Collapse each Education x Marital_Status cell to its mean before drawing
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  ggtitle("Mean Income Across Demographic Groups") +
  xlab("Education") +
  ylab("Mean Income") +
  labs(fill = "Marital Status") +
  theme_minimal()
# Render the chart
print(plt)
# Mean total spending per education level, with one dodged bar per marital
# status
plt <- ggplot(df, aes(x = Education, y = Total_Spent, fill = Marital_Status)) +
  # Collapse each Education x Marital_Status cell to its mean before drawing
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  # The title previously read "Mean Income ..." (copied from the income
  # chart) although the y axis shows Total_Spent.
  labs(
    title = "Mean Total Spending Across Demographic Groups",
    x = "Education",
    y = "Mean Total_Spent",
    fill = "Marital Status"
  ) +
  theme_minimal()
# Render the chart
print(plt)
# Purchasing Behavior Analysis - From the pie chart, we can deduce that
the majority of revenue comes from the consumption of alcoholic
beverages. - Furthermore, from the second and third charts, it is
evident that the largest consumer group for alcoholic beverages consists
of individuals with a marital status of “Married,” “Single,” or an
educational background of “University.”
# Spending columns, one per product category
product_categories <- c(
  "MntWines", "MntFruits", "MntMeatProducts",
  "MntFishProducts", "MntSweetProducts", "MntGoldProds"
)
# Each category's share of spending: its mean spend per customer as a
# percentage of the summed category means
mean_spending <- colMeans(df[, product_categories])
percentage_spending <- mean_spending / sum(mean_spending) * 100
# One row per category for plotting
pie_data <- data.frame(
  category = names(percentage_spending),
  percentage = percentage_spending
)
# A single stacked bar wrapped into polar coordinates gives the pie chart
pie_chart <- ggplot(pie_data, aes(x = "", y = percentage, fill = category)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar(theta = "y") +
  labs(
    title = "Percentage of Spending on Different Product Categories",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
# Render the pie chart
print(pie_chart)
# Share of overall product spending attributed to each customer group.
#
# group_col:  name of the grouping column in `data` (string)
# data:       data frame holding the grouping and spending columns
# categories: names of the spending columns to aggregate
#
# Returns a data frame with one row per group (column `Group.1`) and one
# column per category. Each cell is that group's spending in the category as a
# percentage of the total spending over all categories and all customers.
spending_share_by <- function(group_col, data = df,
                              categories = product_categories) {
  # The grand total is identical for every cell, so compute it once instead
  # of once per group/category combination.
  grand_total <- sum(data[, categories])
  aggregate(
    data[, categories],
    by = list(data[[group_col]]),
    FUN = function(x) sum(x) / grand_total * 100
  )
}

# Reshape the wide share table to one row per (group, category) pair.
# pivot_longer() replaces the superseded tidyr::gather().
spending_share_long <- function(share_wide) {
  tidyr::pivot_longer(
    share_wide,
    cols = -Group.1,
    names_to = "Product_Category",
    values_to = "Percentage"
  )
}

# Dodged bar chart of spending share per product category, one bar per group.
# `group_label` is the human-readable group name used in the title and legend.
spending_share_plot <- function(share_long, group_label) {
  ggplot(
    share_long,
    aes(x = Product_Category, y = Percentage, fill = factor(Group.1))
  ) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(
      title = paste(
        "Percentage of Spending on Different Product Categories by",
        group_label
      ),
      x = "Product Category",
      y = "Percentage",
      fill = group_label
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Percentage of spending for each product category by Marital_Status
percentage_spending <- spending_share_by("Marital_Status")
percentage_spending_long <- spending_share_long(percentage_spending)
bar_plot <- spending_share_plot(percentage_spending_long, "Marital Status")
print(bar_plot)

# Percentage of spending for each product category by Education
percentage_spending <- spending_share_by("Education")
percentage_spending_long <- spending_share_long(percentage_spending)
bar_plot <- spending_share_plot(percentage_spending_long, "Education")
print(bar_plot)
# Total spending by number of children; Children is converted to a factor so
# each count gets its own box
box_plot <- ggplot(df, aes(x = factor(Children), y = Total_Spent)) +
  geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
  ggtitle("Distribution of Total Spending by Number of Children/Teenagers") +
  xlab("Number of Children/Teenagers") +
  ylab("Total Spending") +
  theme_minimal()
# Render the chart
print(box_plot)
# Purchase-count columns, one per sales channel
purchase_channels <- c(
  "NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases"
)

# Build the box plot of purchase counts for one channel.
#
# ggplot2 evaluates aes() expressions when the plot is rendered, not when it
# is created. Building each plot inside a function gives it its own `channel`
# binding, so a stored plot still shows the right column when printed later.
# The previous loop relied on the global `channel` having the matching value
# at print time, and used `df[[channel]]`, which ggplot2 warns against.
channel_box_plot <- function(channel) {
  force(channel)
  ggplot(df, aes(x = as.factor(channel), y = .data[[channel]])) +
    geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
    labs(
      title = paste("Distribution of Purchases through", channel),
      x = "Purchase Channel",
      y = "Number of Purchases"
    ) +
    theme_minimal()
}

# One plot per channel, stored under the channel's column name
box_plots <- setNames(
  lapply(purchase_channels, channel_box_plot),
  purchase_channels
)

# Print the box plots
for (channel in purchase_channels) {
  print(box_plots[[channel]])
}
## Warning: Use of `df[[channel]]` is discouraged.
## ℹ Use `.data[[channel]]` instead.
## Warning: Use of `df[[channel]]` is discouraged.
## ℹ Use `.data[[channel]]` instead.
## Warning: Use of `df[[channel]]` is discouraged.
## ℹ Use `.data[[channel]]` instead.
# Box plots of each purchase channel's purchase counts, split by marital status
for (channel in purchase_channels) {
  # Look the column up by name through the `.data` pronoun. Referencing
  # `df[[channel]]` inside aes() bypasses the plot's data and makes ggplot2
  # emit a "Use of `df[[channel]]` is discouraged" warning for every plot.
  box_plot <- ggplot(df, aes(x = Marital_Status, y = .data[[channel]])) +
    geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
    labs(
      title = paste("Distribution of", channel, "by Marital_Status"),
      x = "Marital_Status",
      y = paste("Number of", channel)
    ) +
    theme_minimal()
  # Render inside the loop; plots are not auto-printed within for()
  print(box_plot)
}
## Warning: Use of `df[[channel]]` is discouraged.
## ℹ Use `.data[[channel]]` instead.
## Warning: Use of `df[[channel]]` is discouraged.
## ℹ Use `.data[[channel]]` instead.
## Warning: Use of `df[[channel]]` is discouraged.
## ℹ Use `.data[[channel]]` instead.
# Purchases made with a deal, split by marital status
box_plot <- ggplot(df, aes(x = Marital_Status, y = NumDealsPurchases)) +
  geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
  ggtitle("Distribution of Deals Purchases by Marital_Status") +
  xlab("Marital_Status") +
  ylab("Number of Deals Purchases") +
  theme_minimal()
# Render the chart
print(box_plot)
# Hypothesis Testing Income Affects Spending
The results show a statistically significant positive association between consumers' income level and their expenditure on alcoholic beverages and meat products.
# Hypothesis: Income Affects Spending
# Income bands: boundaries and the label of each band. With
# include.lowest = TRUE the first band also contains an income of exactly 0.
income_breaks <- c(0, 30000, 60000, 90000, 120000, Inf)
income_labels <- c("0-30k", "30k-60k", "60k-90k", "90k-120k", "120k+")
# Assign every customer to an income band
customer_data <- df %>%
  mutate(
    IncomeGroup = cut(
      Income,
      breaks = income_breaks,
      labels = income_labels,
      include.lowest = TRUE
    )
  )
# Mean spending per product category within each income band
summary_table <- summarise(
  group_by(customer_data, IncomeGroup),
  Mean_Wines = mean(MntWines),
  Mean_Fruits = mean(MntFruits),
  Mean_MeatProducts = mean(MntMeatProducts),
  Mean_FishProducts = mean(MntFishProducts),
  Mean_SweetProducts = mean(MntSweetProducts),
  Mean_GoldProds = mean(MntGoldProds)
)
# Show the per-band means
print(summary_table)
## # A tibble: 5 × 7
## IncomeGroup Mean_Wines Mean_Fruits Mean_MeatProducts Mean_FishProducts
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 0-30k 13.8 5.75 21.5 8.01
## 2 30k-60k 171. 10.7 61.6 16.2
## 3 60k-90k 587. 54.3 344. 76.0
## 4 90k-120k 820. 66.8 599. 101.
## 5 120k+ 26.5 4.5 622. 4.25
## # ℹ 2 more variables: Mean_SweetProducts <dbl>, Mean_GoldProds <dbl>
# One-way ANOVA per spending variable, starting with wine:
# does mean wine spending differ between income groups?
anova_result_wines <- aov(
  formula = MntWines ~ IncomeGroup,
  data = customer_data
)
anova_result_wines %>%
  summary() %>%
  print()
## Df Sum Sq Mean Sq F value Pr(>F)
## IncomeGroup 4 124853862 31213465 541.5 <2e-16 ***
## Residuals 2235 128819923 57638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same one-way ANOVA for fruit spending across income groups
anova_result_fruits <- aov(
  formula = MntFruits ~ IncomeGroup,
  data = customer_data
)
anova_result_fruits %>%
  summary() %>%
  print()
## Df Sum Sq Mean Sq F value Pr(>F)
## IncomeGroup 4 1101648 275412 252.2 <2e-16 ***
## Residuals 2235 2440285 1092
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The remaining spending variables can be tested the same way.
# Tukey HSD post-hoc comparison for wine spending: shows which pairs of income
# groups differ, with p-values adjusted for multiple comparisons
posthoc_wines <- TukeyHSD(anova_result_wines)
posthoc_wines %>% print()
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = MntWines ~ IncomeGroup, data = customer_data)
##
## $IncomeGroup
## diff lwr upr p adj
## 30k-60k-0-30k 157.54739 117.8176 197.27719 0.0000000
## 60k-90k-0-30k 572.87628 531.5709 614.18163 0.0000000
## 90k-120k-0-30k 806.70781 703.2334 910.18221 0.0000000
## 120k+-0-30k 12.71892 -221.4968 246.93465 0.9998916
## 60k-90k-30k-60k 415.32889 384.3031 446.35466 0.0000000
## 90k-120k-30k-60k 649.16041 549.3435 748.97734 0.0000000
## 120k+-30k-60k -144.82847 -377.4515 87.79455 0.4343004
## 90k-120k-60k-90k 233.83153 133.3771 334.28597 0.0000000
## 120k+-60k-90k -560.15736 -793.0547 -327.26007 0.0000000
## 120k+-90k-120k -793.98889 -1045.4684 -542.50933 0.0000000
# Tukey HSD post-hoc comparison for fruit spending; the other spending
# variables can be compared the same way
posthoc_fruits <- TukeyHSD(anova_result_fruits)
posthoc_fruits %>% print()
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = MntFruits ~ IncomeGroup, data = customer_data)
##
## $IncomeGroup
## diff lwr upr p adj
## 30k-60k-0-30k 4.928495 -0.539713 10.39670 0.1001278
## 60k-90k-0-30k 48.524359 42.839299 54.20942 0.0000000
## 90k-120k-0-30k 61.054054 46.812359 75.29575 0.0000000
## 120k+-0-30k -1.245946 -33.482216 30.99032 0.9999720
## 60k-90k-30k-60k 43.595863 39.325632 47.86609 0.0000000
## 90k-120k-30k-60k 56.125559 42.387261 69.86386 0.0000000
## 120k+-30k-60k -6.174441 -38.191500 25.84262 0.9846753
## 90k-120k-60k-90k 12.529695 -1.296347 26.35574 0.0969113
## 120k+-60k-90k -49.770305 -81.825112 -17.71550 0.0002265
## 120k+-90k-120k -62.300000 -96.912377 -27.68762 0.0000095
Education Influences Total Spending - Null Hypothesis (H0): Education level does not impact total spending. - Alternative Hypothesis (H1): Customers with higher education levels have a higher total spending.
The results indicate that total expenditure differs significantly between education levels: customers with a higher educational background spend more, with the “Basic” group spending markedly less than every other group.
# Hypothesis: Education Influences Total Spending
# Store the education level as a factor so it can serve as the grouping
# variable in the ANOVA
customer_data <- customer_data %>%
  mutate(EducationGroup = as.factor(Education))
# Inspect the education levels present in the data
unique(customer_data$Education)
## [1] "Graduation" "PhD" "Master" "Basic" "2n Cycle"
# Recode if needed (replace 'Basic' with a specific level)
# customer_data$Education <- recode(customer_data$Education, 'Basic' = 'Basic_Level')
# Summary statistics of total spending per education level. Every summarise()
# expression must reduce the group to a single value: the previous per-row sum
# of the Mnt* columns returned one row per customer (2,240 rows) and raised
# dplyr's "more (or less) than 1 row per group" deprecation warning.
summary_table_education <- customer_data %>%
  group_by(EducationGroup) %>%
  summarise(
    Customers = n(),
    Mean_Total_Spending = mean(Total_Spent),
    Median_Total_Spending = median(Total_Spent)
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'EducationGroup'. You can override using
## the `.groups` argument.
# Show the spending summary by education level
summary_table_education %>% print()
## # A tibble: 2,240 × 2
## # Groups: EducationGroup [5]
## EducationGroup Total_Spending
## <fct> <int>
## 1 2n Cycle 133
## 2 2n Cycle 81
## 3 2n Cycle 122
## 4 2n Cycle 1274
## 5 2n Cycle 109
## 6 2n Cycle 18
## 7 2n Cycle 79
## 8 2n Cycle 978
## 9 2n Cycle 72
## 10 2n Cycle 577
## # ℹ 2,230 more rows
# One-way ANOVA: does mean total spending differ between education levels?
anova_result_education <- aov(
  formula = Total_Spent ~ EducationGroup,
  data = customer_data
)
anova_result_education %>%
  summary() %>%
  print()
## Df Sum Sq Mean Sq F value Pr(>F)
## EducationGroup 4 19644802 4911201 13.85 3.66e-11 ***
## Residuals 2235 792449913 354564
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD post-hoc comparison: shows which pairs of education levels differ
# in total spending, with p-values adjusted for multiple comparisons
posthoc_education <- TukeyHSD(anova_result_education)
posthoc_education %>% print()
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Total_Spent ~ EducationGroup, data = customer_data)
##
## $EducationGroup
## diff lwr upr p adj
## Basic-2n Cycle -414.730797 -663.6349059 -165.82669 0.0000557
## Graduation-2n Cycle 123.371753 -0.5724578 247.31596 0.0517520
## Master-2n Cycle 115.253987 -26.7298247 257.23780 0.1740320
## PhD-2n Cycle 175.882371 40.0341803 311.73056 0.0038144
## Graduation-Basic 538.102550 311.6503663 764.55473 0.0000000
## Master-Basic 529.984785 293.1772106 766.79236 0.0000000
## PhD-Basic 590.613169 357.4326479 823.79369 0.0000000
## Master-Graduation -8.117765 -105.5176591 89.28213 0.9994067
## PhD-Graduation 52.510619 -35.7054083 140.72665 0.4814023
## PhD-Master 60.628384 -51.5291078 172.78588 0.5784234
Parenthood Affects Total Spending - Null Hypothesis (H0): There is no significant difference in total spending between customers with and without children. - Alternative Hypothesis (H1): Customers with children have different total spending habits compared to customers without children.
From the previous box plot, we observed a negative correlation between the number of children and expenditure. We can further validate this using a t-test. The results indicate that consumers with children have significantly lower total expenditure compared to those without children.
# Hypothesis: Parenthood Affects Total Spending
# Rebuild customer_data from df and label each customer by whether they have
# any children
customer_data <- df %>%
  mutate(
    HasChildren = ifelse(Children > 0, "With Children", "Without Children")
  )
# Summary statistics of total spending by parenthood status. Every summarise()
# expression must reduce the group to a single value: the previous per-row sum
# of the Mnt* columns returned one row per customer and raised dplyr's
# "more (or less) than 1 row per group" deprecation warning.
summary_table_parenthood <- customer_data %>%
  group_by(HasChildren) %>%
  summarise(
    Customers = n(),
    Mean_Total_Spending = mean(Total_Spent),
    Median_Total_Spending = median(Total_Spent)
  )
# Show the summary (it was previously computed but never displayed)
print(summary_table_parenthood)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'HasChildren'. You can override using the
## `.groups` argument.
# Two-sample t-test of total spending between customers without children
# (Parental_Status = 0) and with children (Parental_Status = 1)
t_test_result <- t.test(
  Total_Spent ~ Parental_Status,
  data = customer_data
)
t_test_result %>% print()
##
## Welch Two Sample t-test
##
## data: Total_Spent by Parental_Status
## t = 25.071, df = 894.24, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 644.6949 754.2049
## sample estimates:
## mean in group 0 mean in group 1
## 1106.0298 406.5799
Age and Spending Habits - Null Hypothesis (H0): There is no correlation between age and total spending. - Alternative Hypothesis (H1): Younger customers exhibit different spending patterns compared to older customers.
The t-value is 5.2986, and the corresponding p-value is 1.282e-07. This extremely small p-value indicates that we can reject the null hypothesis, i.e., the true correlation is not equal to 0. This implies that the correlation between age and total spending is statistically significant.
# Hypothesis: Age and Spending Habits
# Pearson correlation test between customer age and total spending
correlation_result <- cor.test(df$Age, df$Total_Spent, method = "pearson")
# Show the estimate, confidence interval and p-value
correlation_result %>% print()
##
## Pearson's product-moment correlation
##
## data: df$Age and df$Total_Spent
## t = 5.2986, df = 2238, p-value = 1.282e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.07021422 0.15202145
## sample estimates:
## cor
## 0.1113064
Web Visits Influence Web Purchases - Null Hypothesis (H0): The number of web visits does not affect the number of web purchases. - Alternative Hypothesis (H1): Customers who visit the website more frequently are more likely to make web purchases.
From this test, we can observe a statistically significant negative correlation between the number of website visits and the number of website purchases.
One possible explanation is that consumers who spend more time browsing online may engage in window shopping or informational searches without necessarily intending to make immediate purchases. On the other hand, those who spend less time browsing might have clearer preferences or specific purchase intentions, leading to more direct and purposeful buying behavior.
# Hypothesis: Web Visits Influence Web Purchases
# Pearson correlation test between monthly web visits and web purchases
correlation_result <- cor.test(
  df$NumWebVisitsMonth, df$NumWebPurchases,
  method = "pearson"
)
# Show the estimate, confidence interval and p-value
correlation_result %>% print()
##
## Pearson's product-moment correlation
##
## data: df$NumWebVisitsMonth and df$NumWebPurchases
## t = -2.6461, df = 2238, p-value = 0.0082
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09703774 -0.01446393
## sample estimates:
## cor
## -0.05584633
# Simple linear regression of web purchases on monthly web visits
regression_model <- lm(
  formula = NumWebPurchases ~ NumWebVisitsMonth,
  data = customer_data
)
# Coefficients, residual summary and fit statistics
summary(regression_model)
##
## Call:
## lm(formula = NumWebPurchases ~ NumWebVisitsMonth, data = customer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4248 -2.0411 -0.3609 1.7670 22.6391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.42481 0.14123 31.330 <2e-16 ***
## NumWebVisitsMonth -0.06395 0.02417 -2.646 0.0082 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.775 on 2238 degrees of freedom
## Multiple R-squared: 0.003119, Adjusted R-squared: 0.002673
## F-statistic: 7.002 on 1 and 2238 DF, p-value: 0.0082